In [1]:
# Data manipulation and plotting modules
import numpy as np
import pandas as pd
from collections import Counter

# Data pre-processing
# z = (x-mean)/stdev
from sklearn.preprocessing import StandardScaler as ss

# Dimensionality reduction
from sklearn.decomposition import PCA

# Data splitting and model parameter search
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling modules
from xgboost.sklearn import XGBClassifier

# Model pipelining
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# Model evaluation metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import confusion_matrix

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.offline as pyo
import plotly.express as px
import plotly.graph_objs as go
pyo.init_notebook_mode()

import plotly.figure_factory as ff
from xgboost import plot_importance

# Needed for Bayes optimization
# Takes an estimator, performs cross-validation and gives out average scorefrom sklearn.model_selection import cross_val_score

# Misc
import time
import os
import gc
import random
# Used in Randomized parameter search
from scipy.stats import uniform
In [185]:
data = pd.read_csv("C:\\Users\\Lenovo\\Desktop\\Loan Details.csv")
In [3]:
data.shape    
Out[3]:
(67463, 35)
In [4]:
data.head()
Out[4]:
ID Loan Amount Funded Amount Funded Amount Investor Term Batch Enrolled Interest Rate Grade Sub Grade Employment Duration ... Recoveries Collection Recovery Fee Collection 12 months Medical Application Type Last week Pay Accounts Delinquent Total Collection Amount Total Current Balance Total Revolving Credit Limit Loan Status
0 65087372 10000 32236 12329.36286 59 BAT2522922 11.135007 B C4 MORTGAGE ... 2.498291 0.793724 0 INDIVIDUAL 49 0 31 311301 6619 0
1 1450153 3609 11940 12191.99692 59 BAT1586599 12.237563 C D3 RENT ... 2.377215 0.974821 0 INDIVIDUAL 109 0 53 182610 20885 0
2 1969101 28276 9311 21603.22455 59 BAT2136391 12.545884 F D4 MORTGAGE ... 4.316277 1.020075 0 INDIVIDUAL 66 0 34 89801 26155 0
3 6651430 11170 6954 17877.15585 59 BAT2428731 16.731201 C C3 MORTGAGE ... 0.107020 0.749971 0 INDIVIDUAL 39 0 40 9189 60214 0
4 14354669 16890 13226 13539.92667 59 BAT5341619 15.008300 C D4 MORTGAGE ... 1294.818751 0.368953 0 INDIVIDUAL 18 0 430 126029 22579 0

5 rows × 35 columns

In [5]:
data.nunique()
Out[5]:
ID                              67463
Loan Amount                     27525
Funded Amount                   24548
Funded Amount Investor          67441
Term                                3
Batch Enrolled                     41
Interest Rate                   67448
Grade                               7
Sub Grade                          35
Employment Duration                 3
Home Ownership                  67454
Verification Status                 3
Payment Plan                        1
Loan Title                        109
Debit to Income                 67454
Delinquency - two years             9
Inquires - six months               6
Open Account                       36
Public Record                       5
Revolving Balance               20582
Revolving Utilities             67458
Total Accounts                     69
Initial List Status                 2
Total Received Interest         67451
Total Received Late Fee         67380
Recoveries                      67387
Collection Recovery Fee         67313
Collection 12 months Medical        2
Application Type                    2
Last week Pay                     162
Accounts Delinquent                 1
Total Collection Amount          2193
Total Current Balance           60901
Total Revolving Credit Limit    37708
Loan Status                         2
dtype: int64
In [6]:
data.isnull().sum()
Out[6]:
ID                              0
Loan Amount                     0
Funded Amount                   0
Funded Amount Investor          0
Term                            0
Batch Enrolled                  0
Interest Rate                   0
Grade                           0
Sub Grade                       0
Employment Duration             0
Home Ownership                  0
Verification Status             0
Payment Plan                    0
Loan Title                      0
Debit to Income                 0
Delinquency - two years         0
Inquires - six months           0
Open Account                    0
Public Record                   0
Revolving Balance               0
Revolving Utilities             0
Total Accounts                  0
Initial List Status             0
Total Received Interest         0
Total Received Late Fee         0
Recoveries                      0
Collection Recovery Fee         0
Collection 12 months Medical    0
Application Type                0
Last week Pay                   0
Accounts Delinquent             0
Total Collection Amount         0
Total Current Balance           0
Total Revolving Credit Limit    0
Loan Status                     0
dtype: int64
In [7]:
data.describe()
Out[7]:
ID Loan Amount Funded Amount Funded Amount Investor Term Interest Rate Home Ownership Debit to Income Delinquency - two years Inquires - six months ... Total Received Late Fee Recoveries Collection Recovery Fee Collection 12 months Medical Last week Pay Accounts Delinquent Total Collection Amount Total Current Balance Total Revolving Credit Limit Loan Status
count 6.746300e+04 67463.000000 67463.000000 67463.000000 67463.000000 67463.000000 67463.000000 67463.000000 67463.000000 67463.000000 ... 67463.000000 67463.000000 67463.000000 67463.000000 67463.000000 67463.0 67463.000000 6.746300e+04 67463.000000 67463.000000
mean 2.562761e+07 16848.902776 15770.599114 14621.799323 58.173814 11.846258 80541.502522 23.299241 0.327127 0.145754 ... 1.143969 59.691578 1.125141 0.021301 71.163260 0.0 146.467990 1.595739e+05 23123.005544 0.092510
std 2.109155e+07 8367.865726 8150.992662 6785.345170 3.327441 3.718629 45029.120366 8.451824 0.800888 0.473291 ... 5.244365 357.026346 3.489885 0.144385 43.315845 0.0 744.382233 1.390332e+05 20916.699999 0.289747
min 1.297933e+06 1014.000000 1014.000000 1114.590204 36.000000 5.320006 14573.537170 0.675299 0.000000 0.000000 ... 0.000003 0.000036 0.000036 0.000000 0.000000 0.0 1.000000 6.170000e+02 1000.000000 0.000000
25% 6.570288e+06 10012.000000 9266.500000 9831.684984 58.000000 9.297147 51689.843335 16.756416 0.000000 0.000000 ... 0.021114 1.629818 0.476259 0.000000 35.000000 0.0 24.000000 5.037900e+04 8155.500000 0.000000
50% 1.791565e+07 16073.000000 13042.000000 12793.682170 59.000000 11.377696 69335.832680 22.656658 0.000000 0.000000 ... 0.043398 3.344524 0.780141 0.000000 68.000000 0.0 36.000000 1.183690e+05 16733.000000 0.000000
75% 4.271521e+07 22106.000000 21793.000000 17807.594120 59.000000 14.193533 94623.322785 30.048400 0.000000 0.000000 ... 0.071884 5.453727 1.070566 0.000000 105.000000 0.0 46.000000 2.283750e+05 32146.500000 0.000000
max 7.224578e+07 35000.000000 34999.000000 34999.746430 59.000000 27.182348 406561.536400 39.629862 8.000000 5.000000 ... 42.618882 4354.467419 166.833000 1.000000 161.000000 0.0 16421.000000 1.177412e+06 201169.000000 1.000000

8 rows × 26 columns

In [8]:
data.dtypes
Out[8]:
ID                                int64
Loan Amount                       int64
Funded Amount                     int64
Funded Amount Investor          float64
Term                              int64
Batch Enrolled                   object
Interest Rate                   float64
Grade                            object
Sub Grade                        object
Employment Duration              object
Home Ownership                  float64
Verification Status              object
Payment Plan                     object
Loan Title                       object
Debit to Income                 float64
Delinquency - two years           int64
Inquires - six months             int64
Open Account                      int64
Public Record                     int64
Revolving Balance                 int64
Revolving Utilities             float64
Total Accounts                    int64
Initial List Status              object
Total Received Interest         float64
Total Received Late Fee         float64
Recoveries                      float64
Collection Recovery Fee         float64
Collection 12 months Medical      int64
Application Type                 object
Last week Pay                     int64
Accounts Delinquent               int64
Total Collection Amount           int64
Total Current Balance             int64
Total Revolving Credit Limit      int64
Loan Status                       int64
dtype: object
In [9]:
import seaborn as sns
ax=sns.countplot(x='Loan Status',data=data);
In [10]:
# Finding Correlation,
corrMatrix=data.corr()
corrMatrix 
Out[10]:
ID Loan Amount Funded Amount Funded Amount Investor Term Interest Rate Home Ownership Debit to Income Delinquency - two years Inquires - six months ... Total Received Late Fee Recoveries Collection Recovery Fee Collection 12 months Medical Last week Pay Accounts Delinquent Total Collection Amount Total Current Balance Total Revolving Credit Limit Loan Status
ID 1.000000 -0.003480 -0.003302 0.002954 0.003226 0.004258 -0.004390 -0.010578 0.000568 -0.006628 ... 0.006674 -0.001435 -0.001802 -0.002900 0.000907 NaN 0.003745 -0.003572 -0.005141 0.000472
Loan Amount -0.003480 1.000000 -0.000551 0.002831 0.004277 -0.004888 0.016691 0.007959 -0.000469 0.008962 ... -0.000034 -0.001606 -0.002142 -0.002726 -0.002362 NaN -0.004135 -0.008285 0.002289 -0.004473
Funded Amount -0.003302 -0.000551 1.000000 0.010227 -0.001503 0.002310 -0.003518 0.002347 0.011313 -0.001587 ... 0.001542 0.000462 0.000175 0.001071 -0.003476 NaN -0.002821 -0.001499 0.006145 0.001364
Funded Amount Investor 0.002954 0.002831 0.010227 1.000000 -0.008943 -0.001917 0.001339 0.000112 0.001925 -0.003073 ... -0.000232 0.000966 -0.007272 0.001814 0.004248 NaN 0.006862 0.003283 0.005669 -0.000091
Term 0.003226 0.004277 -0.001503 -0.008943 1.000000 -0.012688 -0.021813 0.001026 -0.004494 -0.005272 ... -0.000736 -0.003475 -0.001203 -0.003263 0.007035 NaN 0.000358 0.003361 -0.005068 -0.003410
Interest Rate 0.004258 -0.004888 0.002310 -0.001917 -0.012688 1.000000 0.005467 -0.011203 0.004045 0.009172 ... 0.003119 0.009348 0.001281 -0.009895 -0.012652 NaN 0.002771 -0.002567 0.016651 0.002900
Home Ownership -0.004390 0.016691 -0.003518 0.001339 -0.021813 0.005467 1.000000 0.022781 -0.003793 0.005024 ... 0.004011 0.004399 -0.003821 -0.002639 -0.004573 NaN 0.006314 0.007117 0.005008 0.003716
Debit to Income -0.010578 0.007959 0.002347 0.000112 0.001026 -0.011203 0.022781 1.000000 -0.004007 0.002246 ... -0.010224 -0.009693 0.002200 -0.000716 0.008639 NaN 0.001555 -0.011582 -0.007236 -0.003057
Delinquency - two years 0.000568 -0.000469 0.011313 0.001925 -0.004494 0.004045 -0.003793 -0.004007 1.000000 0.014679 ... 0.007943 0.017348 0.002707 0.003451 0.001160 NaN 0.005642 0.002602 0.009315 0.009990
Inquires - six months -0.006628 0.008962 -0.001587 -0.003073 -0.005272 0.009172 0.005024 0.002246 0.014679 1.000000 ... 0.008296 0.012487 0.008388 -0.004436 -0.004453 NaN -0.002210 0.001531 0.004678 0.000578
Open Account -0.002781 0.009088 0.005755 -0.007850 0.021362 -0.003250 0.009080 0.001100 0.004904 -0.002109 ... 0.000828 -0.001216 -0.006832 0.003397 -0.013415 NaN -0.006176 -0.007470 0.003096 -0.007073
Public Record 0.009830 -0.002542 0.003750 0.005002 -0.002827 0.006979 0.005631 -0.007813 0.006716 0.004087 ... 0.016594 0.008905 0.004725 0.008878 0.004281 NaN 0.012928 0.003935 0.012046 0.010590
Revolving Balance 0.004315 -0.001738 -0.004485 -0.009102 -0.002317 0.018999 0.016783 -0.011414 0.009394 0.002457 ... 0.004903 0.005056 -0.003939 0.007516 -0.010279 NaN 0.004282 -0.007537 0.023366 -0.001073
Revolving Utilities 0.000910 0.014828 0.004460 -0.003027 -0.010018 0.006089 -0.005556 0.003691 0.002474 0.005150 ... -0.001363 -0.002381 -0.001952 0.010783 0.007961 NaN 0.006067 -0.019785 -0.009818 0.004120
Total Accounts 0.001518 -0.002071 0.008298 0.003191 0.001204 0.006584 0.021452 -0.005683 -0.003085 0.007562 ... 0.004910 -0.002171 0.007701 0.002538 0.015405 NaN 0.005593 -0.006937 0.032492 0.000222
Total Received Interest -0.002938 -0.001887 0.002759 0.001432 0.008663 0.006998 -0.010346 0.006504 -0.004511 0.009556 ... 0.002507 -0.000717 0.003921 0.004759 0.002643 NaN 0.001027 0.001374 0.012015 0.001680
Total Received Late Fee 0.006674 -0.000034 0.001542 -0.000232 -0.000736 0.003119 0.004011 -0.010224 0.007943 0.008296 ... 1.000000 0.007992 0.004856 0.003590 0.001776 NaN 0.007441 -0.000526 0.014839 0.009365
Recoveries -0.001435 -0.001606 0.000462 0.000966 -0.003475 0.009348 0.004399 -0.009693 0.017348 0.012487 ... 0.007992 1.000000 0.008328 0.007874 -0.001787 NaN 0.004703 -0.000488 -0.000146 -0.000652
Collection Recovery Fee -0.001802 -0.002142 0.000175 -0.007272 -0.001203 0.001281 -0.003821 0.002200 0.002707 0.008388 ... 0.004856 0.008328 1.000000 0.007689 0.001106 NaN 0.001800 0.002851 -0.000208 -0.003828
Collection 12 months Medical -0.002900 -0.002726 0.001071 0.001814 -0.003263 -0.009895 -0.002639 -0.000716 0.003451 -0.004436 ... 0.003590 0.007874 0.007689 1.000000 0.005540 NaN 0.004768 0.001728 -0.001033 -0.000686
Last week Pay 0.000907 -0.002362 -0.003476 0.004248 0.007035 -0.012652 -0.004573 0.008639 0.001160 -0.004453 ... 0.001776 -0.001787 0.001106 0.005540 1.000000 NaN 0.001457 0.001147 -0.001583 0.006117
Accounts Delinquent NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Total Collection Amount 0.003745 -0.004135 -0.002821 0.006862 0.000358 0.002771 0.006314 0.001555 0.005642 -0.002210 ... 0.007441 0.004703 0.001800 0.004768 0.001457 NaN 1.000000 0.005318 0.006541 0.007892
Total Current Balance -0.003572 -0.008285 -0.001499 0.003283 0.003361 -0.002567 0.007117 -0.011582 0.002602 0.001531 ... -0.000526 -0.000488 0.002851 0.001728 0.001147 NaN 0.005318 1.000000 0.004865 0.009828
Total Revolving Credit Limit -0.005141 0.002289 0.006145 0.005669 -0.005068 0.016651 0.005008 -0.007236 0.009315 0.004678 ... 0.014839 -0.000146 -0.000208 -0.001033 -0.001583 NaN 0.006541 0.004865 1.000000 0.001454
Loan Status 0.000472 -0.004473 0.001364 -0.000091 -0.003410 0.002900 0.003716 -0.003057 0.009990 0.000578 ... 0.009365 -0.000652 -0.003828 -0.000686 0.006117 NaN 0.007892 0.009828 0.001454 1.000000

26 rows × 26 columns

In [11]:
ax= plt.axes()
_=sns.heatmap(data.drop(columns=['Total Current Balance']).corr(),cmap='Blues',cbar=None,ax=ax)
_=ax.set_title('Correlation Heatmap')
In [12]:
plt.figure(figsize=(5,5))
a = sns.barplot(x="Grade",y="Total Collection Amount",data=data)

Feature Engineering¶

In [186]:
data["Loan_qty"] = pd.qcut(
                               data['Loan Amount'],
                               q = 3,
                               labels= ["low", "medium", "high"]
                               )
In [14]:
data.Loan_qty.value_counts()
Out[14]:
low       22489
medium    22488
high      22486
Name: Loan_qty, dtype: int64
In [187]:
data.drop(labels='ID', axis=1, inplace=True)
data.drop(labels='Batch Enrolled', axis=1, inplace=True)
data.drop(['Debit to Income','Application Type'], axis=1, inplace=True)
In [16]:
data.head()
Out[16]:
Loan Amount Funded Amount Funded Amount Investor Term Interest Rate Grade Sub Grade Employment Duration Home Ownership Verification Status ... Recoveries Collection Recovery Fee Collection 12 months Medical Last week Pay Accounts Delinquent Total Collection Amount Total Current Balance Total Revolving Credit Limit Loan Status Loan_qty
0 10000 32236 12329.36286 59 11.135007 B C4 MORTGAGE 176346.62670 Not Verified ... 2.498291 0.793724 0 49 0 31 311301 6619 0 low
1 3609 11940 12191.99692 59 12.237563 C D3 RENT 39833.92100 Source Verified ... 2.377215 0.974821 0 109 0 53 182610 20885 0 low
2 28276 9311 21603.22455 59 12.545884 F D4 MORTGAGE 91506.69105 Source Verified ... 4.316277 1.020075 0 66 0 34 89801 26155 0 high
3 11170 6954 17877.15585 59 16.731201 C C3 MORTGAGE 108286.57590 Source Verified ... 0.107020 0.749971 0 39 0 40 9189 60214 0 low
4 16890 13226 13539.92667 59 15.008300 C D4 MORTGAGE 44234.82545 Source Verified ... 1294.818751 0.368953 0 18 0 430 126029 22579 0 medium

5 rows × 32 columns

In [17]:
# Data pre-processing
# Data splitting and model parameter search
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Hyperparameter optimization
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib
import warnings
warnings.filterwarnings("ignore")

# Classifiers
from catboost import CatBoostClassifier

# Model selection
from sklearn.model_selection import StratifiedKFold

# Metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer
from time import time
In [18]:
#     z = (x-mean)/stdev
from sklearn.preprocessing import StandardScaler as ss

# Dimensionality reduction and noise removal
from sklearn.decomposition import PCA
from xgboost.sklearn import XGBClassifier

# Model pipelining
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# Hyperparameter optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
In [19]:
# For plotting
import matplotlib.pyplot as plt
In [20]:
# Model evaluation metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import confusion_matrix
In [21]:
# From a cell display outputs from multiple commands:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
In [22]:
# To check train data file total columns and rows
print("\n Shape")
data.shape                # 67463, 33)

#To check total columns
print("\n\n Columns")
data.columns.values      
print("\n")

#To check Data types
print("\n\nData types")                        
data.dtypes.value_counts()  
print("\n")
data.head(3)
 Shape
Out[22]:
(67463, 32)

 Columns
Out[22]:
array(['Loan Amount', 'Funded Amount', 'Funded Amount Investor', 'Term',
       'Interest Rate', 'Grade', 'Sub Grade', 'Employment Duration',
       'Home Ownership', 'Verification Status', 'Payment Plan',
       'Loan Title', 'Delinquency - two years', 'Inquires - six months',
       'Open Account', 'Public Record', 'Revolving Balance',
       'Revolving Utilities', 'Total Accounts', 'Initial List Status',
       'Total Received Interest', 'Total Received Late Fee', 'Recoveries',
       'Collection Recovery Fee', 'Collection 12 months Medical',
       'Last week Pay', 'Accounts Delinquent', 'Total Collection Amount',
       'Total Current Balance', 'Total Revolving Credit Limit',
       'Loan Status', 'Loan_qty'], dtype=object)



Data types
Out[22]:
int64       16
float64      8
object       7
category     1
dtype: int64

Out[22]:
Loan Amount Funded Amount Funded Amount Investor Term Interest Rate Grade Sub Grade Employment Duration Home Ownership Verification Status ... Recoveries Collection Recovery Fee Collection 12 months Medical Last week Pay Accounts Delinquent Total Collection Amount Total Current Balance Total Revolving Credit Limit Loan Status Loan_qty
0 10000 32236 12329.36286 59 11.135007 B C4 MORTGAGE 176346.62670 Not Verified ... 2.498291 0.793724 0 49 0 31 311301 6619 0 low
1 3609 11940 12191.99692 59 12.237563 C D3 RENT 39833.92100 Source Verified ... 2.377215 0.974821 0 109 0 53 182610 20885 0 low
2 28276 9311 21603.22455 59 12.545884 F D4 MORTGAGE 91506.69105 Source Verified ... 4.316277 1.020075 0 66 0 34 89801 26155 0 high

3 rows × 32 columns

In [23]:
cat = ['Grade','Sub Grade','Employment Duration','Verification Status','Payment Plan','Loan Title','Initial List Status','Loan_qty']
In [24]:
# Import label encoder 
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
data[cat]=data[cat].apply(label_encoder.fit_transform)
In [25]:
data.head()
Out[25]:
Loan Amount Funded Amount Funded Amount Investor Term Interest Rate Grade Sub Grade Employment Duration Home Ownership Verification Status ... Recoveries Collection Recovery Fee Collection 12 months Medical Last week Pay Accounts Delinquent Total Collection Amount Total Current Balance Total Revolving Credit Limit Loan Status Loan_qty
0 10000 32236 12329.36286 59 11.135007 1 13 0 176346.62670 0 ... 2.498291 0.793724 0 49 0 31 311301 6619 0 1
1 3609 11940 12191.99692 59 12.237563 2 17 2 39833.92100 1 ... 2.377215 0.974821 0 109 0 53 182610 20885 0 1
2 28276 9311 21603.22455 59 12.545884 5 18 0 91506.69105 1 ... 4.316277 1.020075 0 66 0 34 89801 26155 0 0
3 11170 6954 17877.15585 59 16.731201 2 12 0 108286.57590 1 ... 0.107020 0.749971 0 39 0 40 9189 60214 0 1
4 16890 13226 13539.92667 59 15.008300 2 18 0 44234.82545 1 ... 1294.818751 0.368953 0 18 0 430 126029 22579 0 2

5 rows × 32 columns

PIPELINING¶

In [82]:
#  Divide data into predictors and target
#     First 10 columns are predictors
X = data.iloc[ :, 0:30]
X.head(2)

# 11st column is target
print("\n\nTarget,y, values")
y = data.iloc[ : , 30]
y.head()
Out[82]:
Loan Amount Funded Amount Funded Amount Investor Term Interest Rate Grade Sub Grade Employment Duration Home Ownership Verification Status ... Total Received Interest Total Received Late Fee Recoveries Collection Recovery Fee Collection 12 months Medical Last week Pay Accounts Delinquent Total Collection Amount Total Current Balance Total Revolving Credit Limit
0 10000 32236 12329.36286 59 11.135007 1 13 0 176346.6267 0 ... 2929.646315 0.102055 2.498291 0.793724 0 49 0 31 311301 6619
1 3609 11940 12191.99692 59 12.237563 2 17 2 39833.9210 1 ... 772.769385 0.036181 2.377215 0.974821 0 109 0 53 182610 20885

2 rows × 30 columns


Target,y, values
Out[82]:
0    0
1    0
2    0
3    0
4    0
Name: Loan Status, dtype: int64
In [83]:
#  Split dataset into train and validation parts
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    shuffle = True,
                                                    stratify = y
                                                    )


X_train.shape        
X_test.shape         
y_train.shape        
y_test.shape
Out[83]:
(53970, 30)
Out[83]:
(13493, 30)
Out[83]:
(53970,)
Out[83]:
(13493,)
In [84]:
#Creating pipeline
#Pipe using XGBoost
steps_xg = [('sts', ss() ),
            ('pca', PCA()),
            ('xg',  XGBClassifier(silent = False,
                                  n_jobs=3)        # Specify other parameters here
            )
            ]

# Instantiate Pipeline object
pipe_xg = Pipeline(steps_xg)
In [85]:
# Grid Search code to discover best pipeline parameters
print("\n\n--Which parameters can be tuned?--\n\n")
pipe_xg.get_params()

--Which parameters can be tuned?--


Out[85]:
{'memory': None,
 'steps': [('sts', StandardScaler()),
  ('pca', PCA()),
  ('xg',
   XGBClassifier(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bynode=None,
                 colsample_bytree=None, early_stopping_rounds=None,
                 enable_categorical=False, eval_metric=None, gamma=None,
                 gpu_id=None, grow_policy=None, importance_type=None,
                 interaction_constraints=None, learning_rate=None, max_bin=None,
                 max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
                 max_leaves=None, min_child_weight=None, missing=nan,
                 monotone_constraints=None, n_estimators=100, n_jobs=3,
                 num_parallel_tree=None, predictor=None, random_state=None,
                 reg_alpha=None, reg_lambda=None, ...))],
 'verbose': False,
 'sts': StandardScaler(),
 'pca': PCA(),
 'xg': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, gamma=None,
               gpu_id=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, n_estimators=100, n_jobs=3,
               num_parallel_tree=None, predictor=None, random_state=None,
               reg_alpha=None, reg_lambda=None, ...),
 'sts__copy': True,
 'sts__with_mean': True,
 'sts__with_std': True,
 'pca__copy': True,
 'pca__iterated_power': 'auto',
 'pca__n_components': None,
 'pca__n_oversamples': 10,
 'pca__power_iteration_normalizer': 'auto',
 'pca__random_state': None,
 'pca__svd_solver': 'auto',
 'pca__tol': 0.0,
 'pca__whiten': False,
 'xg__objective': 'binary:logistic',
 'xg__use_label_encoder': False,
 'xg__base_score': None,
 'xg__booster': None,
 'xg__callbacks': None,
 'xg__colsample_bylevel': None,
 'xg__colsample_bynode': None,
 'xg__colsample_bytree': None,
 'xg__early_stopping_rounds': None,
 'xg__enable_categorical': False,
 'xg__eval_metric': None,
 'xg__gamma': None,
 'xg__gpu_id': None,
 'xg__grow_policy': None,
 'xg__importance_type': None,
 'xg__interaction_constraints': None,
 'xg__learning_rate': None,
 'xg__max_bin': None,
 'xg__max_cat_to_onehot': None,
 'xg__max_delta_step': None,
 'xg__max_depth': None,
 'xg__max_leaves': None,
 'xg__min_child_weight': None,
 'xg__missing': nan,
 'xg__monotone_constraints': None,
 'xg__n_estimators': 100,
 'xg__n_jobs': 3,
 'xg__num_parallel_tree': None,
 'xg__predictor': None,
 'xg__random_state': None,
 'xg__reg_alpha': None,
 'xg__reg_lambda': None,
 'xg__sampling_method': None,
 'xg__scale_pos_weight': None,
 'xg__subsample': None,
 'xg__tree_method': None,
 'xg__validate_parameters': None,
 'xg__verbosity': None,
 'xg__silent': False}
In [97]:
# Deifne dictionary
parameters = {'xg__learning_rate':  [0.03, 0.05], # learning rate decides what percentage
                                                  #  of error is to be fitted by
                                                  #   by next boosted tree.
                                                  # See this answer in stackoverflow:
                                                  # https://stats.stackexchange.com/questions/354484/why-does-xgboost-have-a-learning-rate
                                                  # Coefficients of boosted trees decide,
                                                  #  in the overall model or scheme, how much importance
                                                  #   each boosted tree shall have. Values of these
                                                  #    Coefficients are calculated by modeling
                                                  #     algorithm and unlike learning rate are
                                                  #      not hyperparameters. These Coefficients
                                                  #       get adjusted by l1 and l2 parameters
              'xg__n_estimators':   [50,  100],  # Number of boosted trees to fit
                                                  # l1 and l2 specifications will change
                                                  # the values of coeff of boosted trees
                                                  # but not their numbers

              'xg__max_depth':      [10,6],
              'pca__n_components' : [30,5],
              'xg__booster': ['gbtree','gblinear']
              }
In [98]:
#  Define revised dictionary
parameters = {'xg__learning_rate':  [0.03, 0.05], # learning rate decides what percentage
                                                  #  of error is to be fitted by
                                                  #   by next boosted tree.
                                                  # See this answer in stackoverflow:
                                                  # https://stats.stackexchange.com/questions/354484/why-does-xgboost-have-a-learning-rate
                                                  # Coefficients of boosted trees decide,
                                                  #  in the overall model or scheme, how much importance
                                                  #   each boosted tree shall have. Values of these
                                                  #    Coefficients are calculated by modeling
                                                  #     algorithm and unlike learning rate are
                                                  #      not hyperparameters. These Coefficients
                                                  #       get adjusted by l1 and l2 parameters
               'xg__max_depth':      [10,6],
               'pca__n_components' : [30,5],
               }          
In [99]:
#### Instantiating GridSearchCV class
clf = GridSearchCV(pipe_xg,            # pipeline object
                   parameters,         # possible parameters
                   n_jobs = 2,         # USe parallel cpu threads
                   cv =2 ,             # No of folds
                   verbose =1,         # Higher the value, more the verbosity
                   scoring = ['accuracy', 'roc_auc'],  # Metrics for performance
                   refit = 'roc_auc'   # Refitting final model on what parameters?
                                       # Those which maximise auc
                   )
In [100]:
from time import sleep
from time import *             #meaning from time import EVERYTHING
import time
In [ ]:
 
In [101]:
# Start fitting pipeline to data
print("\n\n--Takes time...---\n")
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
print()
(end - start)/60

--Takes time...---

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[22:33:41] WARNING: C:/Users/administrator/workspace/xgboost-win64_release_1.6.0/src/learner.cc:627: 
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Out[101]:
GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('sts', StandardScaler()),
                                       ('pca', PCA()),
                                       ('xg',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      early_stopping_rounds=None,
                                                      enable_categorical=False,
                                                      eval_metric=None,
                                                      gamma=None, gpu_id=None,
                                                      grow_policy=None,
                                                      importance_type=None,
                                                      intera...
                                                      max_leaves=None,
                                                      min_child_weight=None,
                                                      missing=nan,
                                                      monotone_constraints=None,
                                                      n_estimators=100,
                                                      n_jobs=3,
                                                      num_parallel_tree=None,
                                                      predictor=None,
                                                      random_state=None,
                                                      reg_alpha=None,
                                                      reg_lambda=None, ...))]),
             n_jobs=2,
             param_grid={'pca__n_components': [30, 5],
                         'xg__learning_rate': [0.03, 0.05],
                         'xg__max_depth': [10, 6]},
             refit='roc_auc', scoring=['accuracy', 'roc_auc'], verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('sts', StandardScaler()),
                                       ('pca', PCA()),
                                       ('xg',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      early_stopping_rounds=None,
                                                      enable_categorical=False,
                                                      eval_metric=None,
                                                      gamma=None, gpu_id=None,
                                                      grow_policy=None,
                                                      importance_type=None,
                                                      intera...
                                                      max_leaves=None,
                                                      min_child_weight=None,
                                                      missing=nan,
                                                      monotone_constraints=None,
                                                      n_estimators=100,
                                                      n_jobs=3,
                                                      num_parallel_tree=None,
                                                      predictor=None,
                                                      random_state=None,
                                                      reg_alpha=None,
                                                      reg_lambda=None, ...))]),
             n_jobs=2,
             param_grid={'pca__n_components': [30, 5],
                         'xg__learning_rate': [0.03, 0.05],
                         'xg__max_depth': [10, 6]},
             refit='roc_auc', scoring=['accuracy', 'roc_auc'], verbose=1)
Pipeline(steps=[('sts', StandardScaler()), ('pca', PCA()),
                ('xg',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               gamma=None, gpu_id=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=None,
                               max_leaves=None, min_child_weight=None,
                               missing=nan, monotone_constraints=None,
                               n_estimators=100, n_jobs=3,
                               num_parallel_tree=None, predictor=None,
                               random_state=None, reg_alpha=None,
                               reg_lambda=None, ...))])
StandardScaler()
PCA()
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=3,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=None, reg_lambda=None, ...)

Out[101]:
2.2357513189315794
In [102]:
# Make predictions using the best returned model
y_pred = clf.predict(X_test)
In [103]:
# 7.5 Accuracy score
print("\n\n--Accuracy Score--\n")
accuracy = accuracy_score(y_test, y_pred)
accuracy * 100

--Accuracy Score--

Out[103]:
90.7507596531535
In [104]:
# F1 score
print("\n\n--F1 Score ")
f1_score(y_test,y_pred,average ='micro')      


# 7.8 ROC curve and get AUC
print("\n\n--ROC curve--\n")
_=plot_roc_curve(clf, X_test, y_test)

--F1 Score 
Out[104]:
0.9075075965315349

--ROC curve--

In [105]:
#  Get feature importances from GridSearchCV best fitted 'xg' model
clf.best_estimator_.named_steps["xg"].feature_importances_.shape


print("\n\n---Feature importances---\n")
clf.best_estimator_.named_steps["xg"].feature_importances_
Out[105]:
(30,)

---Feature importances---

Out[105]:
array([0.03265371, 0.03472751, 0.0308095 , 0.03223895, 0.03245703,
       0.03357385, 0.03266273, 0.03215893, 0.03352813, 0.03415863,
       0.03453534, 0.03328626, 0.03297805, 0.03232114, 0.03327659,
       0.03340273, 0.03570846, 0.03412129, 0.03340523, 0.03336824,
       0.03346148, 0.03315647, 0.03417563, 0.03429782, 0.03236499,
       0.03339149, 0.03259033, 0.03415657, 0.03439463, 0.0326382 ],
      dtype=float32)
In [106]:
colnames = X.columns.tolist()
In [107]:
#  Create a dataframe of feature importances
#      with names of columns and sorted by feature-imp
imp_values = clf.best_estimator_.named_steps["xg"].feature_importances_

df_imp = pd.DataFrame(
                      data = imp_values,
                      index = colnames,
                      columns = ["imp"]
                      ).sort_values(by = 'imp')


df_imp
Out[107]:
imp
Funded Amount Investor 0.030810
Employment Duration 0.032159
Term 0.032239
Inquires - six months 0.032321
Collection 12 months Medical 0.032365
Interest Rate 0.032457
Accounts Delinquent 0.032590
Total Revolving Credit Limit 0.032638
Loan Amount 0.032654
Sub Grade 0.032663
Delinquency - two years 0.032978
Total Received Late Fee 0.033156
Open Account 0.033277
Loan Title 0.033286
Initial List Status 0.033368
Last week Pay 0.033391
Public Record 0.033403
Total Accounts 0.033405
Total Received Interest 0.033461
Home Ownership 0.033528
Grade 0.033574
Revolving Utilities 0.034121
Total Collection Amount 0.034157
Verification Status 0.034159
Recoveries 0.034176
Collection Recovery Fee 0.034298
Total Current Balance 0.034395
Payment Plan 0.034535
Funded Amount 0.034728
Revolving Balance 0.035708
In [108]:
#  First five columns with least feature importance are:
list(df_imp.index.values[:5])
Out[108]:
['Funded Amount Investor',
 'Employment Duration',
 'Term',
 'Inquires - six months',
 'Collection 12 months Medical']
In [109]:
# Let us drop these from X_train and X_test
Xtrain = X_train.drop(columns = list(df_imp.index.values[:5]))
Xtest = X_test.drop(columns = list(df_imp.index.values[:5]))

# Build model again with reduced dataset
clf_dr = GridSearchCV(pipe_xg,            # pipeline object
                      parameters,         # possible parameters
                      n_jobs = 2,         # USe parallel cpu threads
                      cv =3 ,             # No of folds
                      verbose =2,         # Higher the value, more the verbosity
                      scoring = ['accuracy', 'roc_auc'],  # Metrics for performance
                      refit = 'roc_auc'   # Those which maximise auc
                     )

           
start = time.time()
clf_dr.fit(Xtrain, y_train)
end = time.time()
(end - start)/60
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[22:34:56] WARNING: C:/Users/administrator/workspace/xgboost-win64_release_1.6.0/src/learner.cc:627: 
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Out[109]:
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('sts', StandardScaler()),
                                       ('pca', PCA()),
                                       ('xg',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      early_stopping_rounds=None,
                                                      enable_categorical=False,
                                                      eval_metric=None,
                                                      gamma=None, gpu_id=None,
                                                      grow_policy=None,
                                                      importance_type=None,
                                                      intera...
                                                      max_leaves=None,
                                                      min_child_weight=None,
                                                      missing=nan,
                                                      monotone_constraints=None,
                                                      n_estimators=100,
                                                      n_jobs=3,
                                                      num_parallel_tree=None,
                                                      predictor=None,
                                                      random_state=None,
                                                      reg_alpha=None,
                                                      reg_lambda=None, ...))]),
             n_jobs=2,
             param_grid={'pca__n_components': [30, 5],
                         'xg__learning_rate': [0.03, 0.05],
                         'xg__max_depth': [10, 6]},
             refit='roc_auc', scoring=['accuracy', 'roc_auc'], verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('sts', StandardScaler()),
                                       ('pca', PCA()),
                                       ('xg',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      early_stopping_rounds=None,
                                                      enable_categorical=False,
                                                      eval_metric=None,
                                                      gamma=None, gpu_id=None,
                                                      grow_policy=None,
                                                      importance_type=None,
                                                      intera...
                                                      max_leaves=None,
                                                      min_child_weight=None,
                                                      missing=nan,
                                                      monotone_constraints=None,
                                                      n_estimators=100,
                                                      n_jobs=3,
                                                      num_parallel_tree=None,
                                                      predictor=None,
                                                      random_state=None,
                                                      reg_alpha=None,
                                                      reg_lambda=None, ...))]),
             n_jobs=2,
             param_grid={'pca__n_components': [30, 5],
                         'xg__learning_rate': [0.03, 0.05],
                         'xg__max_depth': [10, 6]},
             refit='roc_auc', scoring=['accuracy', 'roc_auc'], verbose=2)
Pipeline(steps=[('sts', StandardScaler()), ('pca', PCA()),
                ('xg',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               gamma=None, gpu_id=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=None,
                               max_leaves=None, min_child_weight=None,
                               missing=nan, monotone_constraints=None,
                               n_estimators=100, n_jobs=3,
                               num_parallel_tree=None, predictor=None,
                               random_state=None, reg_alpha=None,
                               reg_lambda=None, ...))])
StandardScaler()
PCA()
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=3,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=None, reg_lambda=None, ...)
Out[109]:
0.8580585439999898
In [110]:
# Make predictions
y_pred_dr = clf_dr.predict(Xtest)
In [111]:
# F1 score
f1_score(y_test,y_pred_dr, average ='micro')      
     
f1_score(y_test,y_pred, average ='micro')
Out[111]:
0.9075075965315349
Out[111]:
0.9075075965315349

Random Search¶

In [114]:
import os
import gc
import random

# Used in Randomized parameter search
from scipy.stats import uniform
In [115]:
##################### EE. Randomized Search #################

# Tune parameters using randomized search
# Hyperparameters to tune and their ranges
parameters = {'xg__learning_rate':  uniform(0, 1),
              'xg__n_estimators':   range(50,300),
              'xg__max_depth':      range(3,10),
              'pca__n_components' : range(8,10)}
In [116]:
# Tune parameters using random search
#     Create the object first

rs = RandomizedSearchCV(
                          pipe_xg,
                          param_distributions=parameters,
                          scoring= ['roc_auc', 'accuracy'],
                          n_iter=4,           # Max combination of
                                              # parameter to try. Default = 10
                          verbose = 1,
                          refit = 'roc_auc',
                          n_jobs = 2,          # Use parallel cpu threads
                          cv = 2               # No of folds.
                                              # So n_iter * cv combinations
                        )
In [117]:
start = time.time()
rs.fit(X_train, y_train)
end = time.time()
print()
(end - start)/60
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[22:38:56] WARNING: C:/Users/administrator/workspace/xgboost-win64_release_1.6.0/src/learner.cc:627: 
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Out[117]:
RandomizedSearchCV(cv=2,
                   estimator=Pipeline(steps=[('sts', StandardScaler()),
                                             ('pca', PCA()),
                                             ('xg',
                                              XGBClassifier(base_score=None,
                                                            booster=None,
                                                            callbacks=None,
                                                            colsample_bylevel=None,
                                                            colsample_bynode=None,
                                                            colsample_bytree=None,
                                                            early_stopping_rounds=None,
                                                            enable_categorical=False,
                                                            eval_metric=None,
                                                            gamma=None,
                                                            gpu_id=None,
                                                            grow_policy=None,
                                                            importance_type=None,...
                                                            predictor=None,
                                                            random_state=None,
                                                            reg_alpha=None,
                                                            reg_lambda=None, ...))]),
                   n_iter=4, n_jobs=2,
                   param_distributions={'pca__n_components': range(8, 10),
                                        'xg__learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F04419DA90>,
                                        'xg__max_depth': range(3, 10),
                                        'xg__n_estimators': range(50, 300)},
                   refit='roc_auc', scoring=['roc_auc', 'accuracy'], verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=2,
                   estimator=Pipeline(steps=[('sts', StandardScaler()),
                                             ('pca', PCA()),
                                             ('xg',
                                              XGBClassifier(base_score=None,
                                                            booster=None,
                                                            callbacks=None,
                                                            colsample_bylevel=None,
                                                            colsample_bynode=None,
                                                            colsample_bytree=None,
                                                            early_stopping_rounds=None,
                                                            enable_categorical=False,
                                                            eval_metric=None,
                                                            gamma=None,
                                                            gpu_id=None,
                                                            grow_policy=None,
                                                            importance_type=None,...
                                                            predictor=None,
                                                            random_state=None,
                                                            reg_alpha=None,
                                                            reg_lambda=None, ...))]),
                   n_iter=4, n_jobs=2,
                   param_distributions={'pca__n_components': range(8, 10),
                                        'xg__learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F04419DA90>,
                                        'xg__max_depth': range(3, 10),
                                        'xg__n_estimators': range(50, 300)},
                   refit='roc_auc', scoring=['roc_auc', 'accuracy'], verbose=1)
Pipeline(steps=[('sts', StandardScaler()), ('pca', PCA()),
                ('xg',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               gamma=None, gpu_id=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=None,
                               max_leaves=None, min_child_weight=None,
                               missing=nan, monotone_constraints=None,
                               n_estimators=100, n_jobs=3,
                               num_parallel_tree=None, predictor=None,
                               random_state=None, reg_alpha=None,
                               reg_lambda=None, ...))])
StandardScaler()
PCA()
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=3,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=None, reg_lambda=None, ...)

Out[117]:
0.701155138015747
In [118]:
# Evaluate
f"Best score: {rs.best_score_} " ;print()           
f"Best parameter set: {rs.best_params_} " ; print()


#  Make predictions from the best returned model
y_pred = rs.predict(X_test)


# Accuracy and f1_score
accuracy = accuracy_score(y_test, y_pred)
f"Accuracy: {accuracy * 100.0}"   ; print()      
f"f1 score: {f1_score(y_test,y_pred,average ='micro') }" ; print()
Out[118]:
'Best score: 0.501027006832063 '

Out[118]:
"Best parameter set: {'pca__n_components': 9, 'xg__learning_rate': 0.711485982806732, 'xg__max_depth': 8, 'xg__n_estimators': 121} "

Out[118]:
'Accuracy: 89.42414585340546'

Out[118]:
'f1 score: 0.8942414585340547'

Cross Validation¶

In [119]:
# Cross-validation is a more reliable validation technique than just one train/test split. 
# Here we'll resort to ShuffleSplit to create 5 70%/30% splits
from sklearn.model_selection import ShuffleSplit, KFold
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=17)
In [120]:
from sklearn.model_selection import cross_val_score
In [121]:
from sklearn.ensemble import RandomForestClassifier
In [122]:
from sklearn.ensemble import RandomForestClassifier
#We'll train 2 versions of the RandomForestClassifier model - first with default capacity (trees are not limited in depth), 
#second - with min_samples_leaf=3, i.e. each leave is obliged to have at least 3 instances.
#%%time
model_rf1 = RandomForestClassifier(n_estimators=100, n_jobs=4,
                                   max_depth=None, random_state=17)

# calcuate ROC-AUC for each split
cv_scores_rf1 = cross_val_score(model_rf1, X, y, cv=cv, scoring='roc_auc')
In [123]:
model_rf2 = RandomForestClassifier(n_estimators=100, n_jobs=4,
                                   min_samples_leaf=3, random_state=17)

cv_scores_rf2 = cross_val_score(model_rf2, X, y, cv=cv, 
                                scoring='roc_auc', n_jobs=-1)
In [124]:
#The result returned by cross_val_score is an array with metric values (ROC-AUC) for each split:
cv_scores_rf1
Out[124]:
array([0.51963069, 0.50549526, 0.51712541, 0.52198077, 0.51389309])
In [125]:
cv_scores_rf2
Out[125]:
array([0.52748311, 0.51399317, 0.52109925, 0.51319326, 0.51747239])
In [126]:
#compare average ROC-AUC among all splits for both models.
print('Model 1 mean score:', cv_scores_rf1.mean())
print('Model 2 mean score:', cv_scores_rf2.mean())
Model 1 mean score: 0.5156250436555302
Model 2 mean score: 0.5186482383475146

Structure¶

In [165]:
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import LabelEncoder
In [173]:
df = pd.read_csv("C:\\Users\\Lenovo\\Desktop\\Loan Details.csv")
In [174]:
#  Columns in num_data that are either discrete (with few levels)
#     or numeric
cols=['Loan Amount','Funded Amount','Total Accounts','Total Collection Amount','Total Current Balance','Total Revolving Credit Limit','Loan Status','Funded Amount Investor','Interest Rate','Recoveries','Collection Recovery Fee','Accounts Delinquent']
In [175]:
# Create an instance of StandardScaler object
ss = StandardScaler()
le= LabelEncoder()
In [176]:
# Create copy of DataFrame
df['Loan Status']= le.fit_transform(df['Loan Status'])               # Transform boolean to integer
In [177]:
df
Out[177]:
ID Loan Amount Funded Amount Funded Amount Investor Term Batch Enrolled Interest Rate Grade Sub Grade Employment Duration ... Recoveries Collection Recovery Fee Collection 12 months Medical Application Type Last week Pay Accounts Delinquent Total Collection Amount Total Current Balance Total Revolving Credit Limit Loan Status
0 65087372 10000 32236 12329.36286 59 BAT2522922 11.135007 B C4 MORTGAGE ... 2.498291 0.793724 0 INDIVIDUAL 49 0 31 311301 6619 0
1 1450153 3609 11940 12191.99692 59 BAT1586599 12.237563 C D3 RENT ... 2.377215 0.974821 0 INDIVIDUAL 109 0 53 182610 20885 0
2 1969101 28276 9311 21603.22455 59 BAT2136391 12.545884 F D4 MORTGAGE ... 4.316277 1.020075 0 INDIVIDUAL 66 0 34 89801 26155 0
3 6651430 11170 6954 17877.15585 59 BAT2428731 16.731201 C C3 MORTGAGE ... 0.107020 0.749971 0 INDIVIDUAL 39 0 40 9189 60214 0
4 14354669 16890 13226 13539.92667 59 BAT5341619 15.008300 C D4 MORTGAGE ... 1294.818751 0.368953 0 INDIVIDUAL 18 0 430 126029 22579 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
67458 16164945 13601 6848 13175.28583 59 BAT3193689 9.408858 C A4 MORTGAGE ... 564.614852 0.865230 0 INDIVIDUAL 69 0 48 181775 34301 1
67459 35182714 8323 11046 15637.46301 59 BAT1780517 9.972104 C B3 RENT ... 2.015494 1.403368 0 INDIVIDUAL 14 0 37 22692 8714 0
67460 16435904 15897 32921 12329.45775 59 BAT1761981 19.650943 A F3 MORTGAGE ... 5.673092 1.607093 0 INDIVIDUAL 137 0 17 176857 42330 0
67461 5300325 16567 4975 21353.68465 59 BAT2333412 13.169095 D E3 OWN ... 1.157454 0.207608 0 INDIVIDUAL 73 0 61 361339 39075 0
67462 65443173 15353 29875 14207.44860 59 BAT1930365 16.034631 B D1 MORTGAGE ... 1.856480 0.366386 0 INDIVIDUAL 54 0 47 196960 66060 0

67463 rows × 35 columns

In [178]:
#  Use fit and transform method
nc = ss.fit_transform(data.loc[:,cols])
In [179]:
# Transform numpy array back to pandas dataframe
#        as we will be using pandas plotting functions
nc = pd.DataFrame(nc, columns = cols)
nc.head(2)
Out[179]:
Loan Amount Funded Amount Total Accounts Total Collection Amount Total Current Balance Total Revolving Credit Limit Loan Status Funded Amount Investor Interest Rate Recoveries Collection Recovery Fee Accounts Delinquent
0 -0.818483 2.020064 -1.397725 -0.155120 1.091309 -0.789041 -0.319281 -0.337854 -0.191268 -0.160195 -0.094966 0.0
1 -1.582243 -0.469958 -0.676500 -0.125565 0.165689 -0.106997 -0.319281 -0.358098 0.105229 -0.160534 -0.043073 0.0
In [180]:
#  Add/overwrite few columns that are discrete
#        These columns were not to be scaled
nc['Loan Status'] = data['Loan Status']
nc['Accounts Delinquent'] = data['Accounts Delinquent']
In [181]:
# Parallel coordinates chart
fig1 = plt.figure()
pd.plotting.parallel_coordinates(nc,
                                 'Loan Status',    # class_column
                                  colormap='winter'
                                  )
plt.xticks(rotation=90)
plt.title("Parallel chart with data")
Out[181]:
<AxesSubplot:>
Out[181]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 [Text(0, 0, 'Loan Amount'),
  Text(1, 0, 'Funded Amount'),
  Text(2, 0, 'Total Accounts'),
  Text(3, 0, 'Total Collection Amount'),
  Text(4, 0, 'Total Current Balance'),
  Text(5, 0, 'Total Revolving Credit Limit'),
  Text(6, 0, 'Funded Amount Investor'),
  Text(7, 0, 'Interest Rate'),
  Text(8, 0, 'Recoveries'),
  Text(9, 0, 'Collection Recovery Fee'),
  Text(10, 0, 'Accounts Delinquent')])
Out[181]:
Text(0.5, 1.0, 'Parallel chart with data')
In [183]:
# Andrews charts
fig3 = plt.figure()
pd.plotting.andrews_curves(nc,
                           'Loan Status',
                           colormap = 'winter')

plt.title("Andrews plots with  data")
Out[183]:
<AxesSubplot:>
Out[183]:
Text(0.5, 1.0, 'Andrews plots with  data')
In [184]:
#  Radviz plot
fig5 = plt.figure()
pd.plotting.radviz(nc,
                   class_column ='Loan Status',
                   colormap= plt.cm.winter,
                   alpha = 0.4
                   )
Out[184]:
<AxesSubplot:>
In [ ]: